#This project aims to analyse a kaggle dataset containing professional football players across the top 5 European leagues
#The data contains the player names, position, team, age and their stats for the season
#My goal is to clean the dataset, perform feature engineering,
# create visuals to showcase the best performing players and teams, the age and nationality differences,
# and finally to implement machine learning algorithms based on my work, with the aim to create an accurate predictive model
#Import file
import pandas as pd
import numpy as np
inpath = "C:/Users/ryans/Downloads/"
filename = "2022-2023 Football Player Stats.csv"
players_df = pd.read_csv(inpath + filename, delimiter=';', header=0, index_col=None, encoding='cp1252')
players_df
| Rk | Player | Nation | Pos | Squad | Comp | Age | Born | MP | Starts | ... | Off | Crs | TklW | PKwon | PKcon | OG | Recov | AerWon | AerLost | AerWon% | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Brenden Aaronson | USA | MFFW | Leeds United | Premier League | 22 | 2000 | 20 | 19 | ... | 0.17 | 2.54 | 0.51 | 0.0 | 0.0 | 0.00 | 4.86 | 0.34 | 1.19 | 22.2 |
| 1 | 2 | Yunis Abdelhamid | MAR | DF | Reims | Ligue 1 | 35 | 1987 | 22 | 22 | ... | 0.05 | 0.18 | 1.59 | 0.0 | 0.0 | 0.00 | 6.64 | 2.18 | 1.23 | 64.0 |
| 2 | 3 | Himad Abdelli | FRA | MFFW | Angers | Ligue 1 | 23 | 1999 | 14 | 8 | ... | 0.00 | 1.05 | 1.40 | 0.0 | 0.0 | 0.00 | 8.14 | 0.93 | 1.05 | 47.1 |
| 3 | 4 | Salis Abdul Samed | GHA | MF | Lens | Ligue 1 | 22 | 2000 | 20 | 20 | ... | 0.00 | 0.35 | 0.80 | 0.0 | 0.0 | 0.05 | 6.60 | 0.50 | 0.50 | 50.0 |
| 4 | 5 | Laurent Abergel | FRA | MF | Lorient | Ligue 1 | 30 | 1993 | 15 | 15 | ... | 0.00 | 0.23 | 2.02 | 0.0 | 0.0 | 0.00 | 6.51 | 0.31 | 0.39 | 44.4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2684 | 2685 | Szymon ?urkowski | POL | MF | Fiorentina | Serie A | 25 | 1997 | 2 | 0 | ... | 0.00 | 2.50 | 0.00 | 0.0 | 0.0 | 0.00 | 5.00 | 2.50 | 2.50 | 50.0 |
| 2685 | 2686 | Szymon ?urkowski | POL | MF | Spezia | Serie A | 25 | 1997 | 1 | 0 | ... | 0.00 | 0.00 | 0.00 | 0.0 | 0.0 | 0.00 | 30.00 | 0.00 | 0.00 | 0.0 |
| 2686 | 2687 | Martin Ødegaard | NOR | MF | Arsenal | Premier League | 24 | 1998 | 20 | 20 | ... | 0.05 | 2.51 | 0.70 | 0.0 | 0.0 | 0.00 | 5.19 | 0.48 | 0.70 | 40.9 |
| 2687 | 2688 | Milan ?uri? | BIH | FW | Hellas Verona | Serie A | 32 | 1990 | 16 | 7 | ... | 0.38 | 0.00 | 0.13 | 0.0 | 0.0 | 0.00 | 2.05 | 14.50 | 3.59 | 80.1 |
| 2688 | 2689 | Filip ?uri?i? | SRB | MFFW | Sampdoria | Serie A | 31 | 1992 | 20 | 17 | ... | 0.00 | 1.56 | 0.88 | 0.0 | 0.0 | 0.00 | 4.97 | 0.54 | 0.88 | 38.1 |
2689 rows × 124 columns
#Data cleaning
#isna shows how many values are na
players_df.isna().sum(axis=0)
Rk 0
Player 0
Nation 1
Pos 0
Squad 0
..
OG 0
Recov 0
AerWon 0
AerLost 0
AerWon% 0
Length: 124, dtype: int64
#Display unique player names, check for any anomalies/outliers
print("Unique names:", players_df['Player'].unique())
print('\n')
# Ensure all positions are valid
print("Unique positions:", players_df['Pos'].unique())
print('\n')
#How many competitions/leagues are present in the dataset
leagues = players_df['Comp'].unique()
print("Leagues:", leagues)
Unique names: ['Brenden Aaronson' 'Yunis Abdelhamid' 'Himad Abdelli' ... 'Martin Ødegaard' 'Milan ?uri?' 'Filip ?uri?i?'] Unique positions: ['MFFW' 'DF' 'MF' 'FWMF' 'FW' 'DFFW' 'MFDF' 'GK' 'DFMF' 'FWDF'] Leagues: ['Premier League' 'Ligue 1' 'Serie A' 'La Liga' 'Bundesliga']
#Remove players whose names contain '?' as the data is dirty and not accurate, notice 105 rows are removed
players_df = players_df[~players_df['Player'].str.contains('\?')]
players_df
| Rk | Player | Nation | Pos | Squad | Comp | Age | Born | MP | Starts | ... | Off | Crs | TklW | PKwon | PKcon | OG | Recov | AerWon | AerLost | AerWon% | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Brenden Aaronson | USA | MFFW | Leeds United | Premier League | 22 | 2000 | 20 | 19 | ... | 0.17 | 2.54 | 0.51 | 0.0 | 0.0 | 0.00 | 4.86 | 0.34 | 1.19 | 22.2 |
| 1 | 2 | Yunis Abdelhamid | MAR | DF | Reims | Ligue 1 | 35 | 1987 | 22 | 22 | ... | 0.05 | 0.18 | 1.59 | 0.0 | 0.0 | 0.00 | 6.64 | 2.18 | 1.23 | 64.0 |
| 2 | 3 | Himad Abdelli | FRA | MFFW | Angers | Ligue 1 | 23 | 1999 | 14 | 8 | ... | 0.00 | 1.05 | 1.40 | 0.0 | 0.0 | 0.00 | 8.14 | 0.93 | 1.05 | 47.1 |
| 3 | 4 | Salis Abdul Samed | GHA | MF | Lens | Ligue 1 | 22 | 2000 | 20 | 20 | ... | 0.00 | 0.35 | 0.80 | 0.0 | 0.0 | 0.05 | 6.60 | 0.50 | 0.50 | 50.0 |
| 4 | 5 | Laurent Abergel | FRA | MF | Lorient | Ligue 1 | 30 | 1993 | 15 | 15 | ... | 0.00 | 0.23 | 2.02 | 0.0 | 0.0 | 0.00 | 6.51 | 0.31 | 0.39 | 44.4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2680 | 2681 | Kurt Zouma | FRA | DF | West Ham | Premier League | 28 | 1994 | 15 | 15 | ... | 0.00 | 0.00 | 0.14 | 0.0 | 0.0 | 0.00 | 3.94 | 2.61 | 0.35 | 88.1 |
| 2681 | 2682 | Petar Zovko | BIH | GK | Spezia | Serie A | 20 | 2002 | 1 | 0 | ... | 0.00 | 0.00 | 0.00 | 0.0 | 0.0 | 0.00 | 2.50 | 0.00 | 0.00 | 0.0 |
| 2682 | 2683 | Igor Zubeldia | ESP | DF | Real Sociedad | La Liga | 25 | 1997 | 16 | 14 | ... | 0.00 | 0.21 | 0.86 | 0.0 | 0.0 | 0.00 | 4.93 | 2.00 | 1.57 | 56.0 |
| 2683 | 2684 | Martín Zubimendi | ESP | MF | Real Sociedad | La Liga | 24 | 1999 | 19 | 18 | ... | 0.06 | 0.11 | 1.07 | 0.0 | 0.0 | 0.00 | 6.18 | 2.19 | 0.90 | 70.9 |
| 2686 | 2687 | Martin Ødegaard | NOR | MF | Arsenal | Premier League | 24 | 1998 | 20 | 20 | ... | 0.05 | 2.51 | 0.70 | 0.0 | 0.0 | 0.00 | 5.19 | 0.48 | 0.70 | 40.9 |
2584 rows × 124 columns
#Analyse Age values for outliers
#Interquartile Range method to find outliers
Q1 =players_df['Age'].quantile(0.25)
Q3 = players_df['Age'].quantile(0.75)
IQR = Q3 - Q1
#Define an outlier as being beyond 1.5 times the IQR from the quartiles
IQR = players_df['Age'].quantile(0.75) - players_df['Age'].quantile(0.25)
lower_bound = players_df['Age'].quantile(0.25) - 1.5 * IQR
upper_bound = players_df['Age'].quantile(0.75) + 1.5 * IQR
print('lower bound is:', lower_bound)
print('higher bound is:', upper_bound)
lower bound is: 14.0 higher bound is: 38.0
#Remove players that fall outside the age range of 14-38
#Notice 10 rows are removed
filtered_players_df = players_df[(players_df['Age'] >= lower_bound) & (players_df['Age'] <= upper_bound)]
filtered_players_df
| Rk | Player | Nation | Pos | Squad | Comp | Age | Born | MP | Starts | ... | Off | Crs | TklW | PKwon | PKcon | OG | Recov | AerWon | AerLost | AerWon% | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Brenden Aaronson | USA | MFFW | Leeds United | Premier League | 22 | 2000 | 20 | 19 | ... | 0.17 | 2.54 | 0.51 | 0.0 | 0.0 | 0.00 | 4.86 | 0.34 | 1.19 | 22.2 |
| 1 | 2 | Yunis Abdelhamid | MAR | DF | Reims | Ligue 1 | 35 | 1987 | 22 | 22 | ... | 0.05 | 0.18 | 1.59 | 0.0 | 0.0 | 0.00 | 6.64 | 2.18 | 1.23 | 64.0 |
| 2 | 3 | Himad Abdelli | FRA | MFFW | Angers | Ligue 1 | 23 | 1999 | 14 | 8 | ... | 0.00 | 1.05 | 1.40 | 0.0 | 0.0 | 0.00 | 8.14 | 0.93 | 1.05 | 47.1 |
| 3 | 4 | Salis Abdul Samed | GHA | MF | Lens | Ligue 1 | 22 | 2000 | 20 | 20 | ... | 0.00 | 0.35 | 0.80 | 0.0 | 0.0 | 0.05 | 6.60 | 0.50 | 0.50 | 50.0 |
| 4 | 5 | Laurent Abergel | FRA | MF | Lorient | Ligue 1 | 30 | 1993 | 15 | 15 | ... | 0.00 | 0.23 | 2.02 | 0.0 | 0.0 | 0.00 | 6.51 | 0.31 | 0.39 | 44.4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2680 | 2681 | Kurt Zouma | FRA | DF | West Ham | Premier League | 28 | 1994 | 15 | 15 | ... | 0.00 | 0.00 | 0.14 | 0.0 | 0.0 | 0.00 | 3.94 | 2.61 | 0.35 | 88.1 |
| 2681 | 2682 | Petar Zovko | BIH | GK | Spezia | Serie A | 20 | 2002 | 1 | 0 | ... | 0.00 | 0.00 | 0.00 | 0.0 | 0.0 | 0.00 | 2.50 | 0.00 | 0.00 | 0.0 |
| 2682 | 2683 | Igor Zubeldia | ESP | DF | Real Sociedad | La Liga | 25 | 1997 | 16 | 14 | ... | 0.00 | 0.21 | 0.86 | 0.0 | 0.0 | 0.00 | 4.93 | 2.00 | 1.57 | 56.0 |
| 2683 | 2684 | Martín Zubimendi | ESP | MF | Real Sociedad | La Liga | 24 | 1999 | 19 | 18 | ... | 0.06 | 0.11 | 1.07 | 0.0 | 0.0 | 0.00 | 6.18 | 2.19 | 0.90 | 70.9 |
| 2686 | 2687 | Martin Ødegaard | NOR | MF | Arsenal | Premier League | 24 | 1998 | 20 | 20 | ... | 0.05 | 2.51 | 0.70 | 0.0 | 0.0 | 0.00 | 5.19 | 0.48 | 0.70 | 40.9 |
2574 rows × 124 columns
#Filter the DataFrame to include only rows where MP (matches played) is 5 or greater
#This removes rotational players and those who have maybe struggled with injury
filtered_players_df = filtered_players_df[filtered_players_df['MP'] >= 5]
filtered_players_df
| Rk | Player | Nation | Pos | Squad | Comp | Age | Born | MP | Starts | ... | Off | Crs | TklW | PKwon | PKcon | OG | Recov | AerWon | AerLost | AerWon% | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Brenden Aaronson | USA | MFFW | Leeds United | Premier League | 22 | 2000 | 20 | 19 | ... | 0.17 | 2.54 | 0.51 | 0.0 | 0.0 | 0.00 | 4.86 | 0.34 | 1.19 | 22.2 |
| 1 | 2 | Yunis Abdelhamid | MAR | DF | Reims | Ligue 1 | 35 | 1987 | 22 | 22 | ... | 0.05 | 0.18 | 1.59 | 0.0 | 0.0 | 0.00 | 6.64 | 2.18 | 1.23 | 64.0 |
| 2 | 3 | Himad Abdelli | FRA | MFFW | Angers | Ligue 1 | 23 | 1999 | 14 | 8 | ... | 0.00 | 1.05 | 1.40 | 0.0 | 0.0 | 0.00 | 8.14 | 0.93 | 1.05 | 47.1 |
| 3 | 4 | Salis Abdul Samed | GHA | MF | Lens | Ligue 1 | 22 | 2000 | 20 | 20 | ... | 0.00 | 0.35 | 0.80 | 0.0 | 0.0 | 0.05 | 6.60 | 0.50 | 0.50 | 50.0 |
| 4 | 5 | Laurent Abergel | FRA | MF | Lorient | Ligue 1 | 30 | 1993 | 15 | 15 | ... | 0.00 | 0.23 | 2.02 | 0.0 | 0.0 | 0.00 | 6.51 | 0.31 | 0.39 | 44.4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2678 | 2679 | Nadir Zortea | ITA | DFMF | Atalanta | Serie A | 23 | 1999 | 9 | 1 | ... | 0.00 | 4.00 | 2.00 | 0.0 | 0.0 | 0.00 | 5.60 | 2.00 | 0.40 | 83.3 |
| 2680 | 2681 | Kurt Zouma | FRA | DF | West Ham | Premier League | 28 | 1994 | 15 | 15 | ... | 0.00 | 0.00 | 0.14 | 0.0 | 0.0 | 0.00 | 3.94 | 2.61 | 0.35 | 88.1 |
| 2682 | 2683 | Igor Zubeldia | ESP | DF | Real Sociedad | La Liga | 25 | 1997 | 16 | 14 | ... | 0.00 | 0.21 | 0.86 | 0.0 | 0.0 | 0.00 | 4.93 | 2.00 | 1.57 | 56.0 |
| 2683 | 2684 | Martín Zubimendi | ESP | MF | Real Sociedad | La Liga | 24 | 1999 | 19 | 18 | ... | 0.06 | 0.11 | 1.07 | 0.0 | 0.0 | 0.00 | 6.18 | 2.19 | 0.90 | 70.9 |
| 2686 | 2687 | Martin Ødegaard | NOR | MF | Arsenal | Premier League | 24 | 1998 | 20 | 20 | ... | 0.05 | 2.51 | 0.70 | 0.0 | 0.0 | 0.00 | 5.19 | 0.48 | 0.70 | 40.9 |
1978 rows × 124 columns
#Display the first few rows of the dataset
print("Data Head:")
print('\n')
print(filtered_players_df.head())
print('\n')
#Show basic information about the dataset
print("Data Info:")
print('\n')
print(filtered_players_df.info())
print('\n')
#Display the shape of the dataset (rows, columns)
print("Dataframe shape:", filtered_players_df.shape)
Data Head: Rk Player Nation Pos Squad Comp Age \ 0 1 Brenden Aaronson USA MFFW Leeds United Premier League 22 1 2 Yunis Abdelhamid MAR DF Reims Ligue 1 35 2 3 Himad Abdelli FRA MFFW Angers Ligue 1 23 3 4 Salis Abdul Samed GHA MF Lens Ligue 1 22 4 5 Laurent Abergel FRA MF Lorient Ligue 1 30 Born MP Starts ... Off Crs TklW PKwon PKcon OG Recov AerWon \ 0 2000 20 19 ... 0.17 2.54 0.51 0.0 0.0 0.00 4.86 0.34 1 1987 22 22 ... 0.05 0.18 1.59 0.0 0.0 0.00 6.64 2.18 2 1999 14 8 ... 0.00 1.05 1.40 0.0 0.0 0.00 8.14 0.93 3 2000 20 20 ... 0.00 0.35 0.80 0.0 0.0 0.05 6.60 0.50 4 1993 15 15 ... 0.00 0.23 2.02 0.0 0.0 0.00 6.51 0.31 AerLost AerWon% 0 1.19 22.2 1 1.23 64.0 2 1.05 47.1 3 0.50 50.0 4 0.39 44.4 [5 rows x 124 columns] Data Info: <class 'pandas.core.frame.DataFrame'> Int64Index: 1978 entries, 0 to 2686 Columns: 124 entries, Rk to AerWon% dtypes: float64(112), int64(7), object(5) memory usage: 1.9+ MB None Dataframe shape: (1978, 124)
#Basic statistics for numerical features
print(filtered_players_df.describe())
Rk Age Born MP Starts \
count 1978.000000 1978.000000 1978.000000 1978.000000 1978.000000
mean 1330.520222 26.342770 1995.822042 14.721941 10.746208
std 774.631741 4.215106 4.224681 4.922211 6.366111
min 1.000000 16.000000 1984.000000 5.000000 0.000000
25% 665.250000 23.000000 1993.000000 11.000000 5.000000
50% 1323.000000 26.000000 1996.000000 15.500000 11.000000
75% 1993.500000 29.000000 1999.000000 19.000000 16.000000
max 2687.000000 38.000000 2006.000000 23.000000 23.000000
Min 90s Goals Shots SoT ... \
count 1978.000000 1978.000000 1978.000000 1978.000000 1978.000000 ...
mean 960.107179 10.668453 1.311931 1.258726 0.415940 ...
std 524.718825 5.830213 2.218073 1.048924 0.456647 ...
min 19.000000 0.200000 0.000000 0.000000 0.000000 ...
25% 515.000000 5.700000 0.000000 0.430000 0.050000 ...
50% 945.500000 10.500000 0.000000 0.980000 0.280000 ...
75% 1392.750000 15.500000 2.000000 1.910000 0.660000 ...
max 2070.000000 23.000000 25.000000 6.670000 4.000000 ...
Off Crs TklW PKwon PKcon \
count 1978.000000 1978.000000 1978.000000 1978.000000 1978.000000
mean 0.201532 1.660253 0.956512 0.011871 0.015677
std 0.355716 1.939174 0.637352 0.048743 0.050165
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.200000 0.510000 0.000000 0.000000
50% 0.060000 0.925000 0.900000 0.000000 0.000000
75% 0.260000 2.520000 1.320000 0.000000 0.000000
max 5.000000 11.700000 5.000000 0.870000 0.630000
OG Recov AerWon AerLost AerWon%
count 1978.000000 1978.000000 1978.000000 1978.000000 1978.000000
mean 0.003281 4.994434 1.296891 1.399393 47.577958
std 0.018318 1.958077 1.176698 1.158376 21.362866
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 3.820000 0.490000 0.710000 33.300000
50% 0.000000 5.090000 1.010000 1.150000 48.950000
75% 0.000000 6.250000 1.780000 1.740000 60.000000
max 0.290000 19.000000 12.500000 10.800000 100.000000
[8 rows x 119 columns]
#Count missing values in each column and filter to show only those with missing values
missing_values = filtered_players_df.isnull().sum()
missing_values_cols = missing_values[missing_values > 0]
print("Columns with missing values and their count:")
print(missing_values_cols)
print('\n')
#Count missing values in each column
print('Missing values in each column:')
print(filtered_players_df.isnull().sum())
Columns with missing values and their count:
Series([], dtype: int64)
Missing values in each column:
Rk 0
Player 0
Nation 0
Pos 0
Squad 0
..
OG 0
Recov 0
AerWon 0
AerLost 0
AerWon% 0
Length: 124, dtype: int64
#Data is now clean, change df name to clean_df
clean_df=filtered_players_df
clean_df
| Rk | Player | Nation | Pos | Squad | Comp | Age | Born | MP | Starts | ... | Off | Crs | TklW | PKwon | PKcon | OG | Recov | AerWon | AerLost | AerWon% | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Brenden Aaronson | USA | MFFW | Leeds United | Premier League | 22 | 2000 | 20 | 19 | ... | 0.17 | 2.54 | 0.51 | 0.0 | 0.0 | 0.00 | 4.86 | 0.34 | 1.19 | 22.2 |
| 1 | 2 | Yunis Abdelhamid | MAR | DF | Reims | Ligue 1 | 35 | 1987 | 22 | 22 | ... | 0.05 | 0.18 | 1.59 | 0.0 | 0.0 | 0.00 | 6.64 | 2.18 | 1.23 | 64.0 |
| 2 | 3 | Himad Abdelli | FRA | MFFW | Angers | Ligue 1 | 23 | 1999 | 14 | 8 | ... | 0.00 | 1.05 | 1.40 | 0.0 | 0.0 | 0.00 | 8.14 | 0.93 | 1.05 | 47.1 |
| 3 | 4 | Salis Abdul Samed | GHA | MF | Lens | Ligue 1 | 22 | 2000 | 20 | 20 | ... | 0.00 | 0.35 | 0.80 | 0.0 | 0.0 | 0.05 | 6.60 | 0.50 | 0.50 | 50.0 |
| 4 | 5 | Laurent Abergel | FRA | MF | Lorient | Ligue 1 | 30 | 1993 | 15 | 15 | ... | 0.00 | 0.23 | 2.02 | 0.0 | 0.0 | 0.00 | 6.51 | 0.31 | 0.39 | 44.4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2678 | 2679 | Nadir Zortea | ITA | DFMF | Atalanta | Serie A | 23 | 1999 | 9 | 1 | ... | 0.00 | 4.00 | 2.00 | 0.0 | 0.0 | 0.00 | 5.60 | 2.00 | 0.40 | 83.3 |
| 2680 | 2681 | Kurt Zouma | FRA | DF | West Ham | Premier League | 28 | 1994 | 15 | 15 | ... | 0.00 | 0.00 | 0.14 | 0.0 | 0.0 | 0.00 | 3.94 | 2.61 | 0.35 | 88.1 |
| 2682 | 2683 | Igor Zubeldia | ESP | DF | Real Sociedad | La Liga | 25 | 1997 | 16 | 14 | ... | 0.00 | 0.21 | 0.86 | 0.0 | 0.0 | 0.00 | 4.93 | 2.00 | 1.57 | 56.0 |
| 2683 | 2684 | Martín Zubimendi | ESP | MF | Real Sociedad | La Liga | 24 | 1999 | 19 | 18 | ... | 0.06 | 0.11 | 1.07 | 0.0 | 0.0 | 0.00 | 6.18 | 2.19 | 0.90 | 70.9 |
| 2686 | 2687 | Martin Ødegaard | NOR | MF | Arsenal | Premier League | 24 | 1998 | 20 | 20 | ... | 0.05 | 2.51 | 0.70 | 0.0 | 0.0 | 0.00 | 5.19 | 0.48 | 0.70 | 40.9 |
1978 rows × 124 columns
#Feature Engineering
#Create a new feature 'AgeGroup' to categorize players into different age groups
clean_df['AgeGroup'] = pd.cut(clean_df['Age'], bins=[0, 20, 25, 30, 35, 40], labels=['<20', '20-25', '25-30', '30-35', '35-40'])
clean_df
C:\Users\ryans\AppData\Local\Temp\ipykernel_23776\1134519217.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy clean_df['AgeGroup'] = pd.cut(clean_df['Age'], bins=[0, 20, 25, 30, 35, 40], labels=['<20', '20-25', '25-30', '30-35', '35-40'])
| Rk | Player | Nation | Pos | Squad | Comp | Age | Born | MP | Starts | ... | Crs | TklW | PKwon | PKcon | OG | Recov | AerWon | AerLost | AerWon% | AgeGroup | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Brenden Aaronson | USA | MFFW | Leeds United | Premier League | 22 | 2000 | 20 | 19 | ... | 2.54 | 0.51 | 0.0 | 0.0 | 0.00 | 4.86 | 0.34 | 1.19 | 22.2 | 20-25 |
| 1 | 2 | Yunis Abdelhamid | MAR | DF | Reims | Ligue 1 | 35 | 1987 | 22 | 22 | ... | 0.18 | 1.59 | 0.0 | 0.0 | 0.00 | 6.64 | 2.18 | 1.23 | 64.0 | 30-35 |
| 2 | 3 | Himad Abdelli | FRA | MFFW | Angers | Ligue 1 | 23 | 1999 | 14 | 8 | ... | 1.05 | 1.40 | 0.0 | 0.0 | 0.00 | 8.14 | 0.93 | 1.05 | 47.1 | 20-25 |
| 3 | 4 | Salis Abdul Samed | GHA | MF | Lens | Ligue 1 | 22 | 2000 | 20 | 20 | ... | 0.35 | 0.80 | 0.0 | 0.0 | 0.05 | 6.60 | 0.50 | 0.50 | 50.0 | 20-25 |
| 4 | 5 | Laurent Abergel | FRA | MF | Lorient | Ligue 1 | 30 | 1993 | 15 | 15 | ... | 0.23 | 2.02 | 0.0 | 0.0 | 0.00 | 6.51 | 0.31 | 0.39 | 44.4 | 25-30 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2678 | 2679 | Nadir Zortea | ITA | DFMF | Atalanta | Serie A | 23 | 1999 | 9 | 1 | ... | 4.00 | 2.00 | 0.0 | 0.0 | 0.00 | 5.60 | 2.00 | 0.40 | 83.3 | 20-25 |
| 2680 | 2681 | Kurt Zouma | FRA | DF | West Ham | Premier League | 28 | 1994 | 15 | 15 | ... | 0.00 | 0.14 | 0.0 | 0.0 | 0.00 | 3.94 | 2.61 | 0.35 | 88.1 | 25-30 |
| 2682 | 2683 | Igor Zubeldia | ESP | DF | Real Sociedad | La Liga | 25 | 1997 | 16 | 14 | ... | 0.21 | 0.86 | 0.0 | 0.0 | 0.00 | 4.93 | 2.00 | 1.57 | 56.0 | 20-25 |
| 2683 | 2684 | Martín Zubimendi | ESP | MF | Real Sociedad | La Liga | 24 | 1999 | 19 | 18 | ... | 0.11 | 1.07 | 0.0 | 0.0 | 0.00 | 6.18 | 2.19 | 0.90 | 70.9 | 20-25 |
| 2686 | 2687 | Martin Ødegaard | NOR | MF | Arsenal | Premier League | 24 | 1998 | 20 | 20 | ... | 2.51 | 0.70 | 0.0 | 0.0 | 0.00 | 5.19 | 0.48 | 0.70 | 40.9 | 20-25 |
1978 rows × 125 columns
#The AgeGroup variable is categorical ordinal data. We use label encoding to convert it to numerical data
#Label encoding (encoding dictionary) is used on categorical data, including ordinal data, and converts the categorical labels into unique numerical representations.
# E.g: 1. low / 2. medium / 3. high. There is an order here.
#Display all the possible values for the AgeGroup variable (use unique)
clean_df['AgeGroup'].unique()
['20-25', '30-35', '25-30', '<20', '35-40'] Categories (5, object): ['<20' < '20-25' < '25-30' < '30-35' < '35-40']
#Create the encoding dictionary
encoding_dict = {'<20': 1, '20-25': 2, '25-30': 3, '30-35': 4, '35-40': 5}
encoding_dict
{'<20': 1, '20-25': 2, '25-30': 3, '30-35': 4, '35-40': 5}
#Then apply it to 'Age' using pandas and store it in a ‘Age_encoded_pd’ variable
# Apply the encoding dictionary to 'can_inc_cha_ope_sea'
clean_df['Age_encoded_pd'] = clean_df['AgeGroup'].map(encoding_dict)
# Display the resulting DataFrame
clean_df
C:\Users\ryans\AppData\Local\Temp\ipykernel_23776\2690453717.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy clean_df['Age_encoded_pd'] = clean_df['AgeGroup'].map(encoding_dict)
| Rk | Player | Nation | Pos | Squad | Comp | Age | Born | MP | Starts | ... | TklW | PKwon | PKcon | OG | Recov | AerWon | AerLost | AerWon% | AgeGroup | Age_encoded_pd | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Brenden Aaronson | USA | MFFW | Leeds United | Premier League | 22 | 2000 | 20 | 19 | ... | 0.51 | 0.0 | 0.0 | 0.00 | 4.86 | 0.34 | 1.19 | 22.2 | 20-25 | 2 |
| 1 | 2 | Yunis Abdelhamid | MAR | DF | Reims | Ligue 1 | 35 | 1987 | 22 | 22 | ... | 1.59 | 0.0 | 0.0 | 0.00 | 6.64 | 2.18 | 1.23 | 64.0 | 30-35 | 4 |
| 2 | 3 | Himad Abdelli | FRA | MFFW | Angers | Ligue 1 | 23 | 1999 | 14 | 8 | ... | 1.40 | 0.0 | 0.0 | 0.00 | 8.14 | 0.93 | 1.05 | 47.1 | 20-25 | 2 |
| 3 | 4 | Salis Abdul Samed | GHA | MF | Lens | Ligue 1 | 22 | 2000 | 20 | 20 | ... | 0.80 | 0.0 | 0.0 | 0.05 | 6.60 | 0.50 | 0.50 | 50.0 | 20-25 | 2 |
| 4 | 5 | Laurent Abergel | FRA | MF | Lorient | Ligue 1 | 30 | 1993 | 15 | 15 | ... | 2.02 | 0.0 | 0.0 | 0.00 | 6.51 | 0.31 | 0.39 | 44.4 | 25-30 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2678 | 2679 | Nadir Zortea | ITA | DFMF | Atalanta | Serie A | 23 | 1999 | 9 | 1 | ... | 2.00 | 0.0 | 0.0 | 0.00 | 5.60 | 2.00 | 0.40 | 83.3 | 20-25 | 2 |
| 2680 | 2681 | Kurt Zouma | FRA | DF | West Ham | Premier League | 28 | 1994 | 15 | 15 | ... | 0.14 | 0.0 | 0.0 | 0.00 | 3.94 | 2.61 | 0.35 | 88.1 | 25-30 | 3 |
| 2682 | 2683 | Igor Zubeldia | ESP | DF | Real Sociedad | La Liga | 25 | 1997 | 16 | 14 | ... | 0.86 | 0.0 | 0.0 | 0.00 | 4.93 | 2.00 | 1.57 | 56.0 | 20-25 | 2 |
| 2683 | 2684 | Martín Zubimendi | ESP | MF | Real Sociedad | La Liga | 24 | 1999 | 19 | 18 | ... | 1.07 | 0.0 | 0.0 | 0.00 | 6.18 | 2.19 | 0.90 | 70.9 | 20-25 | 2 |
| 2686 | 2687 | Martin Ødegaard | NOR | MF | Arsenal | Premier League | 24 | 1998 | 20 | 20 | ... | 0.70 | 0.0 | 0.0 | 0.00 | 5.19 | 0.48 | 0.70 | 40.9 | 20-25 | 2 |
1978 rows × 126 columns
#I will keep the Age variable as I will use it for the visualizations below
#Let's visualize the data
#Visualization 1: Distribution of player ages
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(10, 6))
sns.histplot(clean_df['Age'], kde=True, bins=20)
plt.title('Distribution of Player Ages')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()
# Visualization 2
#Create a bar chart displaying the top 20 teams with the highest goals scored
#Aggregate the goals by team
goal_distribution = clean_df.groupby(['Squad', 'Comp'])['Goals'].sum().reset_index()
goal_distribution
| Squad | Comp | Goals | |
|---|---|---|---|
| 0 | Ajaccio | Ligue 1 | 16 |
| 1 | Almería | La Liga | 22 |
| 2 | Angers | Ligue 1 | 15 |
| 3 | Arsenal | Premier League | 41 |
| 4 | Aston Villa | Premier League | 24 |
| ... | ... | ... | ... |
| 93 | Villarreal | La Liga | 21 |
| 94 | Werder Bremen | Bundesliga | 29 |
| 95 | West Ham | Premier League | 18 |
| 96 | Wolfsburg | Bundesliga | 35 |
| 97 | Wolves | Premier League | 13 |
98 rows × 3 columns
#Sort the teams by goals scored
goal_distribution = goal_distribution.sort_values('Goals', ascending=False)
goal_distribution
| Squad | Comp | Goals | |
|---|---|---|---|
| 11 | Bayern Munich | Bundesliga | 58 |
| 68 | Paris S-G | Ligue 1 | 54 |
| 55 | Manchester City | Premier League | 52 |
| 63 | Napoli | Serie A | 48 |
| 59 | Monaco | Ligue 1 | 48 |
| ... | ... | ... | ... |
| 24 | Cádiz | La Liga | 13 |
| 97 | Wolves | Premier League | 13 |
| 92 | Valladolid | La Liga | 12 |
| 36 | Hellas Verona | Serie A | 11 |
| 77 | Sampdoria | Serie A | 8 |
98 rows × 3 columns
#Define a color map for each league
league_colors = {
'Premier League': 'blue',
'La Liga': 'red',
'Bundesliga': 'orange',
'Serie A': 'green',
'Ligue 1': 'purple'
}
league_colors
{'Premier League': 'blue',
'La Liga': 'red',
'Bundesliga': 'orange',
'Serie A': 'green',
'Ligue 1': 'purple'}
#Assign colors to each team based on their league
team_colors = [league_colors.get(league, 'gray') for league in goal_distribution['Comp']]
#Create custom legend items: coloured patches
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
#Create the bar chart, but only keep the top 20 teams
plt.figure(figsize=(20, 10))
goal_distribution = goal_distribution.sort_values('Goals', ascending=False).head(20)
bars = plt.bar(goal_distribution['Squad'], goal_distribution['Goals'], color=team_colors, width=0.8)
#Adjust x-axis labels
plt.xticks(rotation=90, ha='right', fontsize=10)
#Adjust the y-axis label
plt.yticks(fontsize=10)
#Create a legend and move it outside the plot area
legend_items = [Patch(facecolor=league_colors[league], label=league) for league in league_colors]
plt.legend(handles=legend_items, title="Leagues", fontsize=20, loc='upper left', bbox_to_anchor=(1, 1))
plt.title('Goal Distribution Across Teams', fontsize=20)
plt.xlabel('Teams', fontsize=20)
plt.ylabel('Total Goals Scored', fontsize=20)
plt.tight_layout(pad=2)
plt.show()
#The goal distribution can be seen as fairly even across the 5 leagues
#Visualization 4 - Horizontal bar chart for top 5 goalscorers in each league
#Top 5 goalscorers in each league
top_scorers = clean_df.groupby(['Comp', 'Player'])['Goals'].sum().reset_index()
#Sort the scorers within each league
top_scorers = top_scorers.sort_values(['Comp', 'Goals'], ascending=[True, False])
#Let's take the top 5 scorers from each league for the visualization
top_scorers_per_league = top_scorers.groupby('Comp').head(5)
#Plot the horizontal bar chart
plt.figure(figsize=(10, 8))
#Create a color list matching the leagues in top_scorers_per_league
#I want to keep the league colours consistent
legend = [league_colors[league] for league in top_scorers_per_league['Comp'].unique()]
#Now, use the palette in the barplot
sns.barplot(x='Goals', y='Player', hue='Comp', data=top_scorers_per_league, dodge=False, palette=legend)
plt.title('Top 5 Scorers in Each League')
plt.xlabel('Total Goals Scored')
plt.ylabel('Player')
plt.legend(title='League')
plt.tight_layout()
plt.show()
#The top goal scorers seem fairly even across the 5 leagues, with Erling Haaland being the only outlier
#The above graph shows the top goalscorers, but it doesn't factor that some leagues have played more games than others.
#Also, assists are also important metrics to show how effective players are for their team.
#Let's create a column showing goals and assists (goal contributions) per 90 minutes played (the length of a match).
#This shows who is performing well, based on how many minutes they've played overall this season.
#We'll store this in the ['GA_Per90'] variable.
#Visualization 5 - Goal contributions per 90 minutes
#Calculate goals and assists per 90 minutes
clean_df['GoalsPer90'] = (clean_df['Goals'] / clean_df['Min']) * 90
clean_df['AssistsPer90'] = (clean_df['Assists'] / clean_df['Min']) * 90
#Create a new column for goals and assists per 90 minutes
clean_df['GA_Per90'] = clean_df['GoalsPer90'] + clean_df['AssistsPer90']
#Filter players with a reasonable amount of minutes played
min_minutes_threshold = 900
top_players = clean_df[clean_df['Min'] > min_minutes_threshold]
#Sort the players by goals and assists per 90 minutes
top_players = top_players.sort_values('GA_Per90', ascending=False)
#We'll take the top 20 players for the visualization
top_players = top_players.head(20)
#Assign colors to each player based on their league. Notice the league_colors variable from before
player_colors = [league_colors.get(league, 'gray') for league in top_players['Comp']]
#Plot the data
plt.figure(figsize=(14, 8))
sns.barplot(x='GA_Per90', y='Player', data=top_players, palette=player_colors)
#Add a legend for leagues
legend_items = [Patch(facecolor=league_colors[league], label=league) for league in league_colors]
plt.legend(handles=legend_items, title="Leagues")
plt.title('Top Players - Goal Contributions per 90 Minutes')
plt.xlabel('G/A per 90 Minutes')
plt.ylabel('Player')
plt.show()
C:\Users\ryans\AppData\Local\Temp\ipykernel_23776\772913004.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy clean_df['GoalsPer90'] = (clean_df['Goals'] / clean_df['Min']) * 90 C:\Users\ryans\AppData\Local\Temp\ipykernel_23776\772913004.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy clean_df['AssistsPer90'] = (clean_df['Assists'] / clean_df['Min']) * 90 C:\Users\ryans\AppData\Local\Temp\ipykernel_23776\772913004.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy clean_df['GA_Per90'] = clean_df['GoalsPer90'] + clean_df['AssistsPer90']
#Visualization 6 - Box Plot showing Age Distribution within each league
plt.figure(figsize=(14, 7))
sns.boxplot(x='Comp', y='Age', data=clean_df)
plt.title('Age Distribution within the 5 Leagues')
plt.xticks(rotation=45)
plt.xlabel('League')
plt.ylabel('Age')
plt.show()
#The ages are dispersed evenly, with the mean in La Liga being slightly higher
#Visualization 7 - Treemap to show different nationalities across the different leagues
import plotly.express as px
#Group by league and nationality and count the number of players
nationality_counts = clean_df.groupby(['Comp', 'Nation']).size().reset_index(name='Count')
#Plotting the TreeMap
fig = px.treemap(nationality_counts, path=['Comp', 'Nation'], values='Count', color='Nation',
title='Nationalities of Players in Each League',
color_continuous_scale='Rainbow')
fig.show()
#Visualization 8
#I want to create another treemap showing the unique nationality count within each league
#Group by league and count the unique number of nationalities
nationality_counts = clean_df.groupby('Comp')['Nation'].nunique().reset_index(name='Unique Nationalities')
#Since I am now interested in the count of nationalities rather than players, I adjust the values parameter
fig = px.treemap(nationality_counts, path=['Comp'], values='Unique Nationalities',
title='Count of Nationalities in Each League',
color='Unique Nationalities', color_continuous_scale='Rainbow')
fig.show()
#As observed by the visual,
#Serie A has the most diversification of nationalities (63), while Premier league has the least (51).
#Machine Learning
#Let's use Random forest to predict Goals, based on other key variables using the variables present,
#and the variables I have created
#RandomForest algorithm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Assuming 'Goals' is the target variable and other relevant features are selected
X = clean_df[['Age_encoded_pd', 'GoalsPer90', 'AssistsPer90', 'Int', 'ToSuc%', 'Carries', 'CarTotDist', ]] # Example features
y = clean_df['Goals'] # Target variable
# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=40)
# Initialize and train the Random Forest classifier, with 100 trees in the forest
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
# Make predictions and evaluate the model
predictions = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))
Accuracy: 0.803030303030303
#Decision trees algorithm
from sklearn.tree import DecisionTreeClassifier
#Set the algorithm parameters and initialize the model
dt_model = DecisionTreeClassifier(random_state=0)
#Fit the model to the training data and make predictions on the test data
y_pred_dt = dt_model.fit(X_train, y_train).predict(X_test)
#Evaluate model performance using accuracy
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"Decision Tree Accuracy: {accuracy_dt:.2f}")
Decision Tree Accuracy: 0.80
#Both models seem to be accurate, at around 80%
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
#Select features and target
xdf = clean_df[['Age', 'GoalsPer90', 'AssistsPer90', 'Int', 'ToSuc%', 'Carries', 'CarTotDist']]
ydf = clean_df['Goals'] #Target variable, what I want to predict
#Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(xdf, ydf, test_size=0.3, random_state=40)
#Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
#GradientBoostingRegressor algorithm
#Gradient Boosting is a boosting technique that builds one tree at a time,
#where each new tree helps to correct errors made by previously trained trees.
#The "Gradient" part refers to the use of gradient descent algorithm to minimize the loss when adding new models.
from sklearn.ensemble import GradientBoostingRegressor
#Initialize and train the Gradient Boosting Regressor
gbr_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
gbr_model.fit(X_train_scaled, y_train)
#Make predictions and evaluate the model
gbr_predictions = gbr_model.predict(X_test_scaled)
print("Gradient Boosting Regressor MSE:", mean_squared_error(y_test, gbr_predictions))
print("Gradient Boosting Regressor R2:", r2_score(y_test, gbr_predictions))
Gradient Boosting Regressor MSE: 0.937768508455521 Gradient Boosting Regressor R2: 0.8100557645425788
#Visualization 9 - Feature Importance
from sklearn.ensemble import RandomForestRegressor
#Initialize the Random Forest model
rf = RandomForestRegressor(n_estimators=100)
#Fit the model
rf.fit(X_train, y_train)
#Get feature importances
importances = rf.feature_importances_
#Print feature importances
for i, feature in enumerate(X_train.columns):
print(f"{feature}: {importances[i]}")
features = X_train.columns
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(10, 6))
plt.title("Feature Importances")
plt.bar(range(X_train.shape[1]), importances[indices], align="center")
plt.xticks(range(X_train.shape[1]), features[indices], rotation=90)
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.show()
Age: 0.017294580244490917 GoalsPer90: 0.762103059321091 AssistsPer90: 0.13908120136472957 Int: 0.027288563268093452 ToSuc%: 0.018204194504547452 Carries: 0.01913214858401267 CarTotDist: 0.016896252713034764
#GoalsPer90 and AssistsPer90 are seen as the most important features, which are the features I created